
import pandas as pd
from _funcs import parallel_read_csv, get_chunks, read_csv_chunk

# Root directory of the PatentsView data; main() builds every input and output
# path by appending a relative path to this string (hence the trailing slash).
# NOTE(review): this name shadows the builtin dir() for the whole module.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

## ==================================================================================================================
## Generate omission panel for citations added by examiners.
## Zihao Li. 06/2024

## Inputs:  rawdata/g_us_patent_citation.tsv
##          cleandata/sim_score_1981_2015_top5.csv

## Output:  temp/actual_citation_lst_examiner.csv
##          temp/omission_panel5_examiner.csv

## For applicant vs. examiner citation details, see https://patentsview.org/forum/7/topic/123
## ==================================================================================================================


def _examiner_citation_lists(df_actual):
    """Return one row per citing patent with its examiner-added citations.

    Args:
        df_actual: Citation table with columns ``patent_id``,
            ``citation_patent_id`` and ``citation_category``.

    Returns:
        DataFrame sorted by ``patent_id`` with columns ``patent_id`` and
        ``actual_citation_list``. Every distinct ``patent_id`` in
        ``df_actual`` appears once; patents without any row whose category is
        'cited by examiner' get an empty list.
    """
    examiner_rows = df_actual[df_actual['citation_category'] == 'cited by examiner']
    examiner_citation_dict = examiner_rows.groupby('patent_id')['citation_patent_id'].apply(list).to_dict()
    del examiner_rows

    # Only the distinct citing patents are needed here; grouping the full
    # citation table into lists first would be discarded work.
    patent_ids = (
        df_actual['patent_id']
        .dropna()
        .drop_duplicates()
        .sort_values()
        .reset_index(drop=True)
    )
    df_actual_lst = pd.DataFrame({'patent_id': patent_ids})
    # Column-wise lookup instead of a row-wise DataFrame.apply(axis=1).
    df_actual_lst['actual_citation_list'] = df_actual_lst['patent_id'].map(
        lambda pid: examiner_citation_dict.get(pid, [])
    )
    return df_actual_lst


def _omission_flags(cited_ids, citation_lists):
    """Return 0 where the cited id is in the paired citation list, else 1."""
    return [
        0 if cited in actual else 1
        for cited, actual in zip(cited_ids, citation_lists)
    ]


def main():
    """Build and export the examiner-citation omission panel.

    Reads the actual citations and the top-5 similarity ("artificial")
    citations from below the module-level ``dir`` root, writes
    ``temp/actual_citation_lst_examiner.csv`` and
    ``temp/omission_panel5_examiner.csv`` there, and prints progress messages.
    ``omission`` is 0 when the artificial ``cited_patent_id`` is among the
    citations added by the examiner for that patent, and 1 otherwise.
    """

    ## Load actual citation data
    print('Loading actual citation data...')
    df_actual = parallel_read_csv(dir + 'rawdata/g_us_patent_citation.tsv', file_type='tsv')
    print('Shape of actual_citation dataset is {}.'.format(df_actual.shape)) # Shape (128401915, 7)

    ## Remove non-numeric patents
    print('Removing non-numeric patents...')
    # na=True treats a missing id as non-numeric so the row is dropped; without
    # it a missing patent_id makes the `~` negation raise a TypeError.
    has_letters = df_actual["patent_id"].str.contains("[a-zA-Z]", na=True)
    df_actual = df_actual[~has_letters]
    bad_citation_id = df_actual["citation_patent_id"].str.contains("[^0-9.]", na=True)
    df_actual = df_actual[~bad_citation_id]
    del has_letters, bad_citation_id
    # errors="raise": any id that survived the filters but is still not
    # parseable aborts the run instead of silently becoming NaN.
    df_actual["patent_id"] = pd.to_numeric(df_actual["patent_id"], errors="raise")
    df_actual["citation_patent_id"] = pd.to_numeric(df_actual["citation_patent_id"], errors="raise")
    print('Shape of actual_citation dataset (after removing non-numeric patents) is {}.'.format(df_actual.shape)) # (113603267, 7)

    ## Generate actual citation list, keeping only citations made by EXAMINERS
    print('Generating actual citation list...')
    df_actual_lst = _examiner_citation_lists(df_actual)
    # The raw citation table is not needed any more; free it before loading
    # the next large dataset.
    del df_actual
    print('Shape of actual_citation dataset (after grouping by patent_id) is {}.'.format(df_actual_lst.shape)) # (6719417, 2)

    ## Generate num_citations variable for df_actual_lst (examiners only)
    df_actual_lst['num_citations_examiners'] = df_actual_lst["actual_citation_list"].apply(len)
    # Export df_actual_lst for examiners
    print('Exporting df_actual_lst for examiners...')
    df_actual_lst.to_csv(dir + 'temp/actual_citation_lst_examiner.csv', index=False)
    # The count is only wanted in the exported file, not in the panel.
    df_actual_lst = df_actual_lst.drop(columns=['num_citations_examiners'])

    ## Merge with artificial citation data
    print('Loading artificial citation data...')
    df_artificial = parallel_read_csv(dir + 'cleandata/sim_score_1981_2015_top5.csv', file_type='csv')
    df_artificial = df_artificial.drop(columns=['patent_idx', 'cited_patent_idx', 'patent_year', 'cited_patent_year'])
    print('Shape of artificial_citation dataset is {}.'.format(df_artificial.shape)) # (24766255, 3)

    print('Merging actual_citation_list_examiner with artificial citation data...')
    df = df_artificial.merge(df_actual_lst, how='left', on='patent_id')
    del df_actual_lst, df_artificial
    print('Shape of merged dataset is {}.'.format(df.shape)) # (24766255, 4)

    ## Generate omission index
    print('Generating omission index...')
    # Rows whose patent_id has no entry in the actual citation data come out
    # of the left merge with a missing list; drop them.
    df = df.dropna(subset=['actual_citation_list'])
    df['omission'] = _omission_flags(df['cited_patent_id'], df['actual_citation_list'])
    print('Shape of omission dataset is {}.'.format(df.shape)) # (23754000, 5)

    ## Export omission panel
    print('Exporting examiner omission panel...')
    df.to_csv(dir + 'temp/omission_panel5_examiner.csv', index=False)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()